In [1]:
import pandas as pd 
import numpy as np          
import matplotlib.pyplot as plt  
from datetime import datetime    
from pandas import Series       
%matplotlib inline 
import gc
import seaborn as sns
import plotly.express as px
import matplotlib as mpl
mpl.rcParams['agg.path.chunksize'] = 10000


pd.set_option('display.max_columns', None, 'max_colwidth', None, 'display.expand_frame_repr', False) 
plt.style.use("ggplot")
In [2]:
df = pd.read_csv(r"C:\Users\riyad\Documents\mat005\MAT005 - Coursework data.csv")
In [3]:
df
Out[3]:
Date_Incoming_call Incoming_Call_Time DateTime
0 01/01/2017 07:28:00 01/01/2017 07:28
1 02/01/2017 21:25:00 02/01/2017 21:25
2 03/01/2017 07:10:00 03/01/2017 07:10
3 03/01/2017 08:20:00 03/01/2017 08:20
4 03/01/2017 12:14:00 03/01/2017 12:14
... ... ... ...
3737 09/10/2022 10:00:00 09/10/2022 10:00
3738 16/10/2022 07:02:00 16/10/2022 07:02
3739 17/10/2022 20:00:00 17/10/2022 20:00
3740 21/10/2022 05:06:00 21/10/2022 05:06
3741 21/10/2022 12:00:00 21/10/2022 12:00

3742 rows × 3 columns

In [4]:
df.dtypes
#dtypes is objects, therefore needs to be converted into datetime
Out[4]:
Date_Incoming_call    object
Incoming_Call_Time    object
DateTime              object
dtype: object
In [5]:
df["Date_Incoming_call"] = pd.to_datetime(df["Date_Incoming_call"])
C:\Users\riyad\AppData\Local\Temp\ipykernel_6096\2563564867.py:1: UserWarning: Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.
  df["Date_Incoming_call"] = pd.to_datetime(df["Date_Incoming_call"])
In [6]:
df["DateTime"] = pd.to_datetime(df["DateTime"])
In [7]:
df["DateTime"]
Out[7]:
0      2017-01-01 07:28:00
1      2017-02-01 21:25:00
2      2017-03-01 07:10:00
3      2017-03-01 08:20:00
4      2017-03-01 12:14:00
               ...        
3737   2022-09-10 10:00:00
3738   2022-10-16 07:02:00
3739   2022-10-17 20:00:00
3740   2022-10-21 05:06:00
3741   2022-10-21 12:00:00
Name: DateTime, Length: 3742, dtype: datetime64[ns]
In [ ]:
 
In [8]:
#df["Incoming_Call_Time"] = pd.to_datetime(df["Incoming_Call_Time"], format='%H:%M:%S').dt.time
#pd.to_datetime(day1['time'], format='%H:%M').dt.time '%H:%M:%S'
In [9]:
#df.describe(datetime_is_numeric=True)
In [10]:
print(df.isnull().sum())
Date_Incoming_call    0
Incoming_Call_Time    8
DateTime              0
dtype: int64
In [11]:
nulldat = pd.isnull(df["Incoming_Call_Time"]) 
    
df[nulldat] 
Out[11]:
Date_Incoming_call Incoming_Call_Time DateTime
454 2017-05-30 NaN 2017-05-30
1272 2018-08-06 NaN 2018-08-06
2301 2019-03-10 NaN 2019-03-10
2304 2019-05-10 NaN 2019-05-10
2308 2019-08-10 NaN 2019-08-10
2322 2019-10-19 NaN 2019-10-19
2335 2019-10-28 NaN 2019-10-28
2338 2019-10-29 NaN 2019-10-29
In [12]:
#print(df["Date_Incoming_call"].describe(datetime_is_numeric=True))
#print(df["Incoming_Call_Time"].describe(datetime_is_numeric=True))

df["DateTime"] = df["Date_Incoming_call"] + df["Incoming_Call_Time"]

In [13]:
df[nulldat]
Out[13]:
Date_Incoming_call Incoming_Call_Time DateTime
454 2017-05-30 NaN 2017-05-30
1272 2018-08-06 NaN 2018-08-06
2301 2019-03-10 NaN 2019-03-10
2304 2019-05-10 NaN 2019-05-10
2308 2019-08-10 NaN 2019-08-10
2322 2019-10-19 NaN 2019-10-19
2335 2019-10-28 NaN 2019-10-28
2338 2019-10-29 NaN 2019-10-29
In [14]:
# see if there is any data in the dates where time is missing
print(df.loc[df["Date_Incoming_call"]=="2017-05-30"])
print(df.loc[df["Date_Incoming_call"]=="2018-08-06"])
print(df.loc[df["Date_Incoming_call"]=="2019-03-10"])
print(df.loc[df["Date_Incoming_call"]=="2019-05-10"])
print(df.loc[df["Date_Incoming_call"]=="2019-10-19"])
print(df.loc[df["Date_Incoming_call"]=="2019-10-28"])
print(df.loc[df["Date_Incoming_call"]=="2019-10-29"])
    Date_Incoming_call Incoming_Call_Time            DateTime
452         2017-05-30           08:00:00 2017-05-30 08:00:00
453         2017-05-30           19:10:00 2017-05-30 19:10:00
454         2017-05-30                NaN 2017-05-30 00:00:00
     Date_Incoming_call Incoming_Call_Time            DateTime
1269         2018-08-06           06:05:00 2018-08-06 06:05:00
1270         2018-08-06           10:50:00 2018-08-06 10:50:00
1271         2018-08-06           11:55:00 2018-08-06 11:55:00
1272         2018-08-06                NaN 2018-08-06 00:00:00
     Date_Incoming_call Incoming_Call_Time            DateTime
2299         2019-03-10           06:00:00 2019-03-10 06:00:00
2300         2019-03-10           17:45:00 2019-03-10 17:45:00
2301         2019-03-10                NaN 2019-03-10 00:00:00
     Date_Incoming_call Incoming_Call_Time   DateTime
2304         2019-05-10                NaN 2019-05-10
     Date_Incoming_call Incoming_Call_Time   DateTime
2322         2019-10-19                NaN 2019-10-19
     Date_Incoming_call Incoming_Call_Time            DateTime
2333         2019-10-28           08:50:00 2019-10-28 08:50:00
2334         2019-10-28           09:00:00 2019-10-28 09:00:00
2335         2019-10-28                NaN 2019-10-28 00:00:00
     Date_Incoming_call Incoming_Call_Time            DateTime
2336         2019-10-29           08:45:00 2019-10-29 08:45:00
2337         2019-10-29           11:30:00 2019-10-29 11:30:00
2338         2019-10-29                NaN 2019-10-29 00:00:00
In [15]:
df =df.dropna().reset_index(drop=True)
In [ ]:
 
In [16]:
sns.displot(df["DateTime"],color="m")
plt.show()
In [17]:
sns.kdeplot(df["DateTime"],color="m" ,shade =True)


plt.title("Number of Calls")
plt.show()
In [18]:
#null values are removed, cleaned data will be exported back to excel to use pivot tables
In [19]:
#df.to_csv('cleaned_data.csv')
In [20]:
clean_data = pd.read_csv(r"C:\Users\riyad\Documents\mat005\sumdat.csv")
In [21]:
clean_data.dtypes
Out[21]:
Date_Of_Call                   object
Count of Date_Incoming_call     int64
dtype: object
In [22]:
clean_data["Date_Of_Call"] = pd.to_datetime(clean_data["Date_Of_Call"])
C:\Users\riyad\AppData\Local\Temp\ipykernel_6096\1168944757.py:1: UserWarning: Parsing dates in DD/MM/YYYY format when dayfirst=False (the default) was specified. This may lead to inconsistently parsed dates! Specify a format to ensure consistent parsing.
  clean_data["Date_Of_Call"] = pd.to_datetime(clean_data["Date_Of_Call"])
In [23]:
clean_data.describe()
Out[23]:
Count of Date_Incoming_call
count 1584.000000
mean 2.357323
std 1.364137
min 1.000000
25% 1.000000
50% 2.000000
75% 3.000000
max 11.000000
In [24]:
clean_data.set_index(["Date_Of_Call"])
Out[24]:
Count of Date_Incoming_call
Date_Of_Call
2017-01-01 1
2017-01-02 1
2017-01-03 5
2017-01-04 3
2017-01-05 7
... ...
2022-10-07 1
2022-10-09 1
2022-10-16 1
2022-10-17 1
2022-10-21 2

1584 rows × 1 columns

In [25]:
x = clean_data["Date_Of_Call"]
y = clean_data["Count of Date_Incoming_call"]

plt.figure(figsize=(35,10))
plt.plot(x,y)

#just a general plot to see data, it is congested so we will look at monthly and yearly scale
Out[25]:
[<matplotlib.lines.Line2D at 0x1f086413040>]
In [26]:
plt.figure(figsize=(20,5))
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('M').mean()["Count of Date_Incoming_call"].plot()
plt.ylabel("Mean Number of Calls")
plt.xlabel("Date")
plt.title("Average Monthly Calls")
plt.show()

clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('Y').mean()["Count of Date_Incoming_call"].plot()
plt.ylabel("Mean Number of Calls")
plt.xlabel("Date")
plt.title("Average Yearly Calls")
plt.show()

plt.figure(figsize=(20,5))
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('W').mean()["Count of Date_Incoming_call"].plot()
plt.ylabel("Mean Number of Calls")
plt.xlabel("Date")
plt.title("Average Weekly Calls")
plt.show()
In [27]:
plt.figure(figsize=(20,5))
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('M').std()["Count of Date_Incoming_call"].plot()
plt.ylabel("Standard Deviation of Calls")
plt.xlabel("Date")
plt.title("Standard Deviation of Monthly Calls")
plt.show()


clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('Y').std()["Count of Date_Incoming_call"].plot()
plt.ylabel("Standard Deviation of Calls")
plt.xlabel("Date")
plt.title("Standard Deviation of Yearly Calls")
plt.show()

plt.figure(figsize=(20,5))
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('W').std()["Count of Date_Incoming_call"].plot()
plt.ylabel("Standard Deviation of Calls")
plt.xlabel("Date")
plt.title("Standard Deviation of Weekly Calls")
plt.show()

as we are working we want the date to be in weeks, the date will be converted into weeks.¶

clean_data["Week"] = clean_data["Date_Of_Call"].dt.isocalendar().week

In [28]:
clean_data[["Date_Of_Call", "Count of Date_Incoming_call"]].set_index("Date_Of_Call").resample('M').sum()["Count of Date_Incoming_call"].plot()
plt.ylabel("Sum Number of Calls")
plt.xlabel("Date")
plt.title("Monthly Calls")
plt.figure(figsize=(35,8))
plt.show()
<Figure size 3500x800 with 0 Axes>
In [ ]:
 

Monthly Calls Per Year Graphs Made in Excel:¶

Attached below:

Monthly Trend of 2017 Calls

2017%20graph-2.png

Monthly Trend of 2018 Calls 2018%20graph-2.png

Monthly Trend of 2019 Calls 2019%20graph-2.png

Monthly Trend of 2020 Calls 2020_graph-2.png

Monthly Trend of 2021 Calls 2021_graph-2.png

Monthly Trend of 2022 Calls 2022_graph-2.png

Quarterly Range Quarterly%20Data%20Range.png

Rolling Statistics¶

In [29]:
#convert data to monthly scale
clean_data.set_index("Date_Of_Call", inplace=True)
monthly_data = clean_data.resample('M').mean()["Count of Date_Incoming_call"]
monthly_data.plot()
Out[29]:
<AxesSubplot:xlabel='Date_Of_Call'>

after looking at the graphs, we'll look at the trends on a monthly scale¶

In [30]:
m3 = pd.read_excel(r"C:\Users\riyad\Documents\mat005\cleaned_data.xlsx", sheet_name="new_month_data")
m3.set_index("Month")
Out[30]:
Sum_Calls
Month
2017-01-01 91
2017-02-01 93
2017-03-01 87
2017-04-01 76
2017-05-01 84
... ...
2022-06-01 32
2022-07-01 38
2022-08-01 69
2022-09-01 42
2022-10-01 15

70 rows × 1 columns

In [31]:
m3["roll_mean"] = m3["Sum_Calls"].rolling(window=12).mean()
m3["roll_std"] = m3["Sum_Calls"].rolling(window=12).std()
m3.head()
Out[31]:
Month Sum_Calls roll_mean roll_std
0 2017-01-01 91 NaN NaN
1 2017-02-01 93 NaN NaN
2 2017-03-01 87 NaN NaN
3 2017-04-01 76 NaN NaN
4 2017-05-01 84 NaN NaN
In [32]:
x = m3["Month"]
y = m3["Sum_Calls"]
plt.ylabel("Sum of Calls per Month")
plt.xlabel("Date")
normal =plt.plot(x,y,"c",label ="Data")

Tests for Stationarity¶

Rolling Statistics¶

In [33]:
import statsmodels.api as sm
In [34]:
plt.figure(figsize=(20,5))
#Normal Monthly Data
sns.lineplot( x = "Month",y = "Sum_Calls", data = m3, label = 'Volume of Calls')
plt.xlabel = "Data Range"
plt.ylabel = "Volume of Calls"
#Rolling Mean
sns.lineplot( x = "Month", y = "roll_mean", data = m3, label = 'Rolling Mean')
#Rolling Std
sns.lineplot( x = "Month", y = "roll_std", data = m3, label = 'Rolling Standard Deviation')
plt.xlabel = "Data Range"
plt.ylabel = "Volume of Calls"
plt.show() 

Dickey-Fuller Test¶

In [35]:
#data is non-stationary as evident above
#apply dickey-fuller test for stationary
from statsmodels.tsa.stattools import adfuller

dftest = adfuller(m3["Sum_Calls"],autolag ="AIC")
dfresult = pd.Series(dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in dftest[4].items():
    dfresult["Critical Value (%s)"%key] = value
print(dfresult)
ADF Statistic:                 -2.423050
p-value                         0.135336
Number of Lags                  1.000000
Number of Observations Used    68.000000
Critical Value (1%)            -3.530399
Critical Value (5%)            -2.905087
Critical Value (10%)           -2.590001
dtype: float64

we can see that from the test, the p value is greater than 0.05, therefore the data is non-stationary

Log Observation:¶

In [36]:
drop_data = m3.drop(columns= ["roll_mean","roll_std"]).copy()

drop_data["Log_Values"]=np.log(drop_data["Sum_Calls"])

drop_data.dtypes
Out[36]:
Month         datetime64[ns]
Sum_Calls              int64
Log_Values           float64
dtype: object

drop_data = drop_data.set_index("Month") drop_data["Sum_Calls"] =drop_data["Sum_Calls"].resample('W').sum() drop_data

In [37]:
#drop_data["Log_Values"]=np.log(drop_data["Sum_Calls"])
In [38]:
df3 =clean_data.copy()
df3
Out[38]:
Count of Date_Incoming_call
Date_Of_Call
2017-01-01 1
2017-01-02 1
2017-01-03 5
2017-01-04 3
2017-01-05 7
... ...
2022-10-07 1
2022-10-09 1
2022-10-16 1
2022-10-17 1
2022-10-21 2

1584 rows × 1 columns

In [39]:
df3.dtypes
Out[39]:
Count of Date_Incoming_call    int64
dtype: object
In [40]:
df3 =df3.resample("W").sum()
df3
Out[40]:
Count of Date_Incoming_call
Date_Of_Call
2017-01-01 1
2017-01-08 25
2017-01-15 22
2017-01-22 22
2017-01-29 21
... ...
2022-09-25 15
2022-10-02 7
2022-10-09 3
2022-10-16 1
2022-10-23 3

304 rows × 1 columns

In [ ]:
 
In [41]:
drop_data = m3.drop(columns= ["roll_mean","roll_std"]).copy()
#drop_data = drop_data.set_index("Month")
drop_data["Log_Values"]=np.log(drop_data["Sum_Calls"])

log_data= drop_data[["Month","Log_Values"]].copy()
log_data=log_data.set_index("Month")

plt.plot(log_data)
Out[41]:
[<matplotlib.lines.Line2D at 0x1f08a2fa4c0>]
In [ ]:
 
In [114]:
logmean = log_data.rolling(window=12).mean()
logstd = log_data.rolling(window=12).std()
plt.plot(log_data,label="Log Value")
plt.plot(logmean, label="Log Rolling Mean")
plt.plot(logstd,label="Log Rolling Std")
plt.legend()
Out[114]:
<matplotlib.legend.Legend at 0x1f09056af10>

Differences¶

In [43]:
plt.figure(figsize=(16,5))
ma_diff = log_data - logmean
ma_diff.dropna(inplace=True)
madiffmean= ma_diff.rolling(window=12).mean()
madiffstd = ma_diff.rolling(window=12).std()
plt.plot(ma_diff, label="Difference")
plt.plot(madiffmean, label="Rolling Mean")
plt.plot(madiffstd, label="Rolling STD")
plt.legend()
Out[43]:
<matplotlib.legend.Legend at 0x1f08a2bac70>
In [44]:
#ADF Test for log transformation
log_dftest = adfuller(ma_diff,autolag ="AIC")
log_dfresult = pd.Series(log_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in log_dftest[4].items():
    log_dfresult["Critical Value (%s)"%key] = value
print(log_dfresult)
ADF Statistic:                 -3.286675
p-value                         0.015485
Number of Lags                  2.000000
Number of Observations Used    56.000000
Critical Value (1%)            -3.552928
Critical Value (5%)            -2.914731
Critical Value (10%)           -2.595137
dtype: float64
In [45]:
log_data
Out[45]:
Log_Values
Month
2017-01-01 4.510860
2017-02-01 4.532599
2017-03-01 4.465908
2017-04-01 4.330733
2017-05-01 4.430817
... ...
2022-06-01 3.465736
2022-07-01 3.637586
2022-08-01 4.234107
2022-09-01 3.737670
2022-10-01 2.708050

70 rows × 1 columns

Seasonality¶

Based off the excel graphs, the overall trend per year does not show that much seasonality, except for negative trends between Jan-Feb for 2018-2020 & 2022. We can use a decomposition plot to determine seasonality

In [46]:
period = len(m3)/2
period
Out[46]:
35.0
In [127]:
## Decomposition Plot
decomposition = sm.tsa.seasonal_decompose(drop_data["Sum_Calls"], model="additive",extrapolate_trend='freq',period =35)
decom_plot =decomposition.plot()
plt.figure(figsize=(30,6))

#we use period of 35 as we have 70 observations, dataframe is unnested, hence we use length df / 2
#https://stackoverflow.com/questions/60017052/decompose-for-time-series-valueerror-you-must-specify-a-period-or-x-must-be
Out[127]:
<Figure size 3000x600 with 0 Axes>
<Figure size 3000x600 with 0 Axes>

Trend¶

In order to see the trend in the log time series, we need to take the exponential weighted average of the rolling mean

In [48]:
exp_avg = log_data.ewm(halflife=12,min_periods=0,ignore_na=False).mean()
In [125]:
plt.plot(log_data, label="Log Transformed Data")
plt.plot(exp_avg, label ="Exponential Weighted Avg")
plt.legend()
plt.title("Exp Trend")
Out[125]:
Text(0.5, 1.0, 'Exp Trend')

De-Trending¶

In [119]:
exp_diff = log_data - exp_avg
exp_diff.dropna(inplace=True)
expdiffmean= exp_diff.rolling(window=12).mean()
expdiffstd = exp_diff.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.plot(exp_diff, label="De-Trended Plot")
plt.plot(expdiffmean, label="Rolling Mean")
plt.plot(expdiffstd, label="Rolling STD")
plt.legend()
plt.title("De-Trend Using Exponential Weighted Average")
exp_dftest = adfuller(exp_diff,autolag ="AIC")
exp_dfresult = pd.Series(exp_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in exp_dftest[4].items():
    exp_dfresult["Critical Value (%s)"%key] = value
print(exp_dfresult)
ADF Statistic:                 -4.722287
p-value                         0.000076
Number of Lags                  0.000000
Number of Observations Used    69.000000
Critical Value (1%)            -3.528890
Critical Value (5%)            -2.904440
Critical Value (10%)           -2.589656
dtype: float64

here we have removed trend from the log transformation of the time series

In [ ]:
 

Difference by shifting¶

In [51]:
ts_diff = log_data-log_data.shift(12)
plt.plot(ts_diff)
Out[51]:
[<matplotlib.lines.Line2D at 0x1f08a2874c0>]

here the data has been shifted by 1 lag

In [118]:
ts_diff.dropna(inplace=True)
ts_diffmean= ts_diff.rolling(window=12).mean()
ts_diffstd = ts_diff.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.plot(ts_diff, label="Shifted Plot")
plt.plot(ts_diffmean, label="Rolling Mean")
plt.plot(ts_diffstd, label="Rolling STD")
plt.legend()
plt.title("Log Difference Shift")
ts_dftest = adfuller(ts_diff,autolag ="AIC")
ts_dfresult = pd.Series(ts_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in ts_dftest[4].items():
    ts_dfresult["Critical Value (%s)"%key] = value
print(ts_dfresult)
ADF Statistic:                 -3.171402
p-value                         0.021701
Number of Lags                  0.000000
Number of Observations Used    57.000000
Critical Value (1%)            -3.550670
Critical Value (5%)            -2.913766
Critical Value (10%)           -2.594624
dtype: float64

Shifting from original plot¶

In [53]:
y = drop_data["Sum_Calls"]
y_diff = y-y.shift(12)
plt.plot(y_diff)
Out[53]:
[<matplotlib.lines.Line2D at 0x1f08bb41310>]
In [117]:
y_diff.dropna(inplace=True)
y_diffmean= y_diff.rolling(window=12).mean()
y_diffstd = y_diff.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.plot(y_diff, label="Shifted Plot")
plt.plot(y_diffmean, label="Rolling Mean")
plt.plot(y_diffstd, label="Rolling STD")
plt.legend()
plt.title("Shifting Via Original Data")
y_dftest = adfuller(y_diff,autolag ="AIC")
y_dfresult = pd.Series(y_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in y_dftest[4].items():
    y_dfresult["Critical Value (%s)"%key] = value
print(y_dfresult)
ADF Statistic:                 -3.393997
p-value                         0.011165
Number of Lags                  0.000000
Number of Observations Used    57.000000
Critical Value (1%)            -3.550670
Critical Value (5%)            -2.913766
Critical Value (10%)           -2.594624
dtype: float64

De-trend from original plot¶

In [55]:
y_trend = (y - y.rolling(window=12).mean())/y.rolling(window=12).std()
plt.plot(y_trend)
Out[55]:
[<matplotlib.lines.Line2D at 0x1f08bba7ee0>]
In [120]:
y_trend.dropna(inplace=True)
y_trendmean= y_trend.rolling(window=12).mean()
y_trendstd =y_trend.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.plot(y_trend, label="De-Trended Plot")
plt.plot(y_trendmean, label="Rolling Mean")
plt.plot(y_trendstd, label="Rolling STD")
plt.legend()
plt.title("De-Trending with Original Data")
y_trend_dftest = adfuller(y_trend,autolag ="AIC")
y_trend_dfresult = pd.Series(y_trend_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in y_trend_dftest[4].items():
    y_trend_dfresult["Critical Value (%s)"%key] = value
print(y_trend_dfresult)
ADF Statistic:                 -3.299364
p-value                         0.014907
Number of Lags                  2.000000
Number of Observations Used    56.000000
Critical Value (1%)            -3.552928
Critical Value (5%)            -2.914731
Critical Value (10%)           -2.595137
dtype: float64

Log Decompostion Plot¶

In [123]:
log_decomp = sm.tsa.seasonal_decompose(log_data, model="multiplicative",extrapolate_trend='freq',period =35)
log_decomp_plot =log_decomp.plot()
In [58]:
ts_log_decomp = log_decomp.resid
ts_log_decomp.dropna(inplace=True)
ts_log_decompmean= ts_log_decomp.rolling(window=12).mean()
ts_log_decompstd = ts_log_decomp.rolling(window=12).std()
plt.figure(figsize=(16,5))
plt.title("Test For Noise Using Residuals")
plt.plot(ts_log_decomp, label="Difference")
plt.plot(ts_log_decompmean, label="Rolling Mean")
plt.plot(ts_log_decompstd, label="Rolling STD")
plt.legend()
ts_log_decomp_dftest = adfuller(ts_log_decomp,autolag ="AIC")
ts_log_decomp_dfresult = pd.Series(ts_log_decomp_dftest[0:4], index=["ADF Statistic:","p-value","Number of Lags",'Number of Observations Used'])
for key,value in ts_log_decomp_dftest[4].items():
    ts_log_decomp_dfresult["Critical Value (%s)"%key] = value
print(ts_log_decomp_dfresult)
ADF Statistic:                 -3.184199
p-value                         0.020918
Number of Lags                  1.000000
Number of Observations Used    68.000000
Critical Value (1%)            -3.530399
Critical Value (5%)            -2.905087
Critical Value (10%)           -2.590001
dtype: float64

Data is not stationary as the test statistic is greater than the 1% critical value, as we use the residuals to measure irregularities in the data

Autocorrelation and Partial Autocorrelation Plot¶

ACF & PACF will be implemented to further test seasonality

In [59]:
from statsmodels.tsa.stattools import acf, pacf
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

plot_acf(ts_diff)
plot_pacf(ts_diff)
C:\Users\riyad\anaconda3\lib\site-packages\statsmodels\graphics\tsaplots.py:348: FutureWarning: The default method 'yw' can produce PACF values outside of the [-1,1] interval. After 0.13, the default will change tounadjusted Yule-Walker ('ywm'). You can use this method now by setting method='ywm'.
  warnings.warn(
Out[59]:

Split Data into Train & Validation¶

In [60]:
df3 =clean_data.copy()
df3
Out[60]:
Count of Date_Incoming_call
Date_Of_Call
2017-01-01 1
2017-01-02 1
2017-01-03 5
2017-01-04 3
2017-01-05 7
... ...
2022-10-07 1
2022-10-09 1
2022-10-16 1
2022-10-17 1
2022-10-21 2

1584 rows × 1 columns

In [61]:
df3.dtypes
Out[61]:
Count of Date_Incoming_call    int64
dtype: object

The test data should range for 8 weeks, in order to predict the next 8 weeks

In [62]:
#for test:
df3 =df3.resample("W").sum()
df3.tail(9)
Out[62]:
Count of Date_Incoming_call
Date_Of_Call
2022-08-28 18
2022-09-04 21
2022-09-11 29
2022-09-18 19
2022-09-25 15
2022-10-02 7
2022-10-09 3
2022-10-16 1
2022-10-23 3
In [63]:
valid = df3.loc["2022-09-04":"2022-10-23"]
train = df3.loc["2017-01-01":"2022-09-04"]
In [64]:
plt.figure(figsize=(20,8))
plt.plot(train,"b",label="Train")
plt.plot(valid, "y",label="Validation")
#plt.xlabel("Datetime") 
#plt.ylabel("Number of Calls") 
plt.legend(loc='best')
plt.title("Test&Train Split")
plt.show()

Naive Approach¶

In [65]:
model_df = valid.copy()
model_df["Naive_Forcast"] = train["Count of Date_Incoming_call"][len(train)-1]
In [66]:
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["Naive_Forcast"],"g", label="Naive forecast")
plt.legend(loc="best")
plt.title("Naive Approach")
plt.show()
In [67]:
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import mean_absolute_error
from math import sqrt 
naive_rmse = sqrt(mean_squared_error(valid["Count of Date_Incoming_call"], model_df["Naive_Forcast"])) 
naive_mape = mean_absolute_error(valid["Count of Date_Incoming_call"],model_df["Naive_Forcast"])
print("RMSE:",naive_rmse)
print("MAPE:",naive_mape)
RMSE: 12.98075498574717
MAPE: 10.75

Moving Average¶

Here we will use 8 as the value of rolling window as we are forecasting 8 weeks

In [68]:
model_df = valid.copy() 
model_df["Moving_Average"] = train["Count of Date_Incoming_call"].rolling(window=8).mean().iloc[-1] 
In [69]:
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["Moving_Average"],"g", label="Moving Average")
plt.legend(loc="best")
plt.title("Moving Average Forecast")
plt.show()
In [70]:
ma_rmse = sqrt(mean_squared_error(valid["Count of Date_Incoming_call"], model_df["Moving_Average"])) 
ma_mape = mean_absolute_error(valid["Count of Date_Incoming_call"],model_df["Moving_Average"])
print("RMSE:",ma_rmse)
print("MAPE:",ma_mape)
RMSE: 9.878164050065173
MAPE: 8.75

Simple Exponential Smoothing¶

In [71]:
#as we experience trend in our series, SES would not be a suitable model to implement. however we will test this
In [72]:
from statsmodels.tsa.holtwinters import SimpleExpSmoothing   
# we need to define the value of alpha, we will set the alpha as 1/(2*m) where m = time period, our time period will be 8 weeks
m =8
alpha = 1/(2*8)
model_df =valid.copy()
ses_model = SimpleExpSmoothing(train["Count of Date_Incoming_call"]).fit(smoothing_level=alpha,optimized=False)
model_df["SES"] = ses_model.forecast(len(valid))
In [73]:
ses_model.summary()
Out[73]:
SimpleExpSmoothing Model Results
Dep. Variable: Count of Date_Incoming_call No. Observations: 297
Model: SimpleExpSmoothing SSE 13102.481
Optimized: False AIC 1128.687
Trend: None BIC 1136.074
Seasonal: None AICC 1128.824
Seasonal Periods: None Date: Fri, 05 May 2023
Box-Cox: False Time: 12:33:26
Box-Cox Coeff.: None
coeff code optimized
smoothing_level 0.0625000 alpha False
initial_level 1.0000000 l.0 False
In [74]:
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["SES"],"g", label="Simple Exponential Smoothing")
plt.legend(loc="best")
plt.title("SES Forecast")
plt.show()
In [75]:
model_df["SES"]
model_df["SES"]= model_df["SES"].fillna(8.161579)
print(model_df["SES"])
Date_Of_Call
2022-09-04    8.161579
2022-09-11    8.161579
2022-09-18    8.161579
2022-09-25    8.161579
2022-10-02    8.161579
2022-10-09    8.161579
2022-10-16    8.161579
2022-10-23    8.161579
Freq: W-SUN, Name: SES, dtype: float64
In [76]:
ses_rmse = sqrt(mean_squared_error(valid["Count of Date_Incoming_call"], model_df["SES"])) 
ses_mape = mean_absolute_error(valid["Count of Date_Incoming_call"],model_df["SES"])
print("RMSE:",ses_rmse)
print("MAPE:",ses_mape)
RMSE: 10.423659895611774
MAPE: 8.750000016454212

Holt-Winters’ Seasonal Method¶

In [77]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing

the model would not work as some of the weekly values equalled to zero in the train data, as there were no data for days in those weeks

In [78]:
train["Count of Date_Incoming_call"].loc[(train["Count of Date_Incoming_call"])==0]
Out[78]:
Date_Of_Call
2019-02-10    0
2019-12-08    0
2019-12-15    0
2019-12-22    0
2019-12-29    0
2020-12-20    0
2020-12-27    0
2022-08-07    0
Name: Count of Date_Incoming_call, dtype: int64
In [79]:
date_list = [datetime(2019, 2, 10),
             datetime(2019, 12, 8),
             datetime(2019, 12, 15),
             datetime(2019, 12, 22),
             datetime(2019, 12, 29),
             datetime(2020, 12, 20),
             datetime(2020, 12, 27),
             datetime(2022, 8, 7),
            ]
In [80]:
train_2 = train.drop(date_list)
train_2.dtypes
Out[80]:
Count of Date_Incoming_call    int64
dtype: object
In [81]:
train_2
Out[81]:
Count of Date_Incoming_call
Date_Of_Call
2017-01-01 1
2017-01-08 25
2017-01-15 22
2017-01-22 22
2017-01-29 21
... ...
2022-07-31 7
2022-08-14 7
2022-08-21 17
2022-08-28 18
2022-09-04 21

289 rows × 1 columns

In [82]:
train_2.index=pd.DatetimeIndex(train_2.index).to_period('W')
In [83]:
hwse_model = ExponentialSmoothing(train_2,trend ="add",seasonal ="mul",seasonal_periods=36).fit()
#from the decomposition plot, we have 2 seasonal periods, therefore we will put this into the paramaters of the model
model_df["HWSE"] = hwse_model.forecast(len(valid))
In [84]:
model_df["HWSE"]
Out[84]:
Date_Of_Call
2022-09-04   NaN
2022-09-11   NaN
2022-09-18   NaN
2022-09-25   NaN
2022-10-02   NaN
2022-10-09   NaN
2022-10-16   NaN
2022-10-23   NaN
Freq: W-SUN, Name: HWSE, dtype: float64
In [85]:
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["HWSE"],"g", label="Holt Winter Method")
plt.legend(loc="best")
plt.title("Holt Winter Forecast")
plt.show()
In [86]:
hwse_model.summary()
Out[86]:
ExponentialSmoothing Model Results
Dep. Variable: Count of Date_Incoming_call No. Observations: 289
Model: ExponentialSmoothing SSE 5303.144
Optimized: True AIC 920.883
Trend: Additive BIC 1067.540
Seasonal: Multiplicative AICC 935.566
Seasonal Periods: 36 Date: Fri, 05 May 2023
Box-Cox: False Time: 12:33:27
Box-Cox Coeff.: None
coeff code optimized
smoothing_level 0.4495968 alpha True
smoothing_trend 2.9807e-10 beta True
smoothing_seasonal 3.4581e-10 gamma True
initial_level 10.847656 l.0 True
initial_trend 0.0213134 b.0 True
initial_seasons.0 1.1030563 s.0 True
initial_seasons.1 1.0197384 s.1 True
initial_seasons.2 1.2875107 s.2 True
initial_seasons.3 1.3537338 s.3 True
initial_seasons.4 1.1128906 s.4 True
initial_seasons.5 1.3129743 s.5 True
initial_seasons.6 1.5035589 s.6 True
initial_seasons.7 1.2515672 s.7 True
initial_seasons.8 1.4424032 s.8 True
initial_seasons.9 1.4961853 s.9 True
initial_seasons.10 1.2179177 s.10 True
initial_seasons.11 1.4119162 s.11 True
initial_seasons.12 1.3938413 s.12 True
initial_seasons.13 1.4703230 s.13 True
initial_seasons.14 1.4563917 s.14 True
initial_seasons.15 1.3901391 s.15 True
initial_seasons.16 1.4909692 s.16 True
initial_seasons.17 1.2790844 s.17 True
initial_seasons.18 1.5066231 s.18 True
initial_seasons.19 1.3552973 s.19 True
initial_seasons.20 1.1947756 s.20 True
initial_seasons.21 0.9346972 s.21 True
initial_seasons.22 0.9289004 s.22 True
initial_seasons.23 1.2427760 s.23 True
initial_seasons.24 0.9234554 s.24 True
initial_seasons.25 0.9857903 s.25 True
initial_seasons.26 1.0587609 s.26 True
initial_seasons.27 0.9610390 s.27 True
initial_seasons.28 0.8203296 s.28 True
initial_seasons.29 0.7158517 s.29 True
initial_seasons.30 0.8293941 s.30 True
initial_seasons.31 0.7263256 s.31 True
initial_seasons.32 0.8367579 s.32 True
initial_seasons.33 0.7320130 s.33 True
initial_seasons.34 1.0538373 s.34 True
initial_seasons.35 1.0765308 s.35 True

ARIMA Model¶

In [87]:
from statsmodels.tsa.arima.model import ARIMA
In [88]:
arima_model = ARIMA(train["Count of Date_Incoming_call"],order=(1,0,2)).fit()
In [89]:
model_df["ARIMA"] = arima_model.forecast(8)
model_df["ARIMA"]
model_df["ARIMA"]= model_df["ARIMA"].fillna(18.3)
In [90]:
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(model_df["ARIMA"],"g", label="ARIMA Model")
plt.legend(loc="best")
plt.title("ARIMA Forecast")
plt.show()
In [91]:
ARIMA_rmse = sqrt(mean_squared_error(valid["Count of Date_Incoming_call"], model_df["ARIMA"])) 
ARIMA_mape = mean_absolute_error(valid["Count of Date_Incoming_call"],model_df["ARIMA"])
print("RMSE:",ARIMA_rmse)
print("MAPE:",ARIMA_mape)
RMSE: 9.305601100537038
MAPE: 7.97205743285917
In [92]:
model_df["ARIMA"]
Out[92]:
Date_Of_Call
2022-09-04    18.300000
2022-09-11    17.009294
2022-09-18    15.672935
2022-09-25    15.404600
2022-10-02    15.158561
2022-10-09    14.932965
2022-10-16    14.726113
2022-10-23    14.536448
Freq: W-SUN, Name: ARIMA, dtype: float64
In [93]:
result =model_df["ARIMA"]
result
Out[93]:
Date_Of_Call
2022-09-04    18.300000
2022-09-11    17.009294
2022-09-18    15.672935
2022-09-25    15.404600
2022-10-02    15.158561
2022-10-09    14.932965
2022-10-16    14.726113
2022-10-23    14.536448
Freq: W-SUN, Name: ARIMA, dtype: float64
In [ ]:
 

from statsmodels.graphics.tsaplots import plot_predict plot_predict(result, start=2022-10-24, end=2022-12-18)

In [94]:
new_log = log_data.copy()
new_log
Out[94]:
Log_Values
Month
2017-01-01 4.510860
2017-02-01 4.532599
2017-03-01 4.465908
2017-04-01 4.330733
2017-05-01 4.430817
... ...
2022-06-01 3.465736
2022-07-01 3.637586
2022-08-01 4.234107
2022-09-01 3.737670
2022-10-01 2.708050

70 rows × 1 columns

In [95]:
new_log.dtypes
new_log.index = pd.DatetimeIndex(new_log.index.values,
                               freq=new_log.index.inferred_freq)
In [ ]:
 
In [96]:
log_arima_model = ARIMA(new_log,order=(1,0,2))
logAresult = log_arima_model.fit()
In [97]:
plt.figure(figsize=(20,8))
plt.plot(log_data,"b" ,label="Log Original Data")
plt.plot(logAresult.fittedvalues,"g", label="Log ARIMA Model")
plt.legend(loc="best")
plt.title("Log ARIMA Forecast")
plt.show()

LogARIMA_rmse = sqrt(mean_squared_error(log_data["Log_Values"],logAresult.fittedvalues)) LogARIMA_mape = mean_absolute_error(log_data["Log_Values"],logAresult.fittedvalues) print("RMSE:",LogARIMA_rmse) print("MAPE:",LogARIMA_mape)

In [98]:
arima_model2 = ARIMA(df3["Count of Date_Incoming_call"],order=(1,0,2)).fit()
In [99]:
res3 =arima_model2.fittedvalues
res3
Out[99]:
Date_Of_Call
2017-01-01    12.040111
2017-01-08     4.132902
2017-01-15    18.380619
2017-01-22    17.321799
2017-01-29    19.069516
                ...    
2022-09-25    18.000037
2022-10-02    16.130886
2022-10-09    10.779558
2022-10-16     7.546969
2022-10-23     5.188549
Freq: W-SUN, Length: 304, dtype: float64
In [100]:
plt.figure(figsize=(20,8))
plt.plot(train["Count of Date_Incoming_call"],"b" ,label="Train")
plt.plot(valid["Count of Date_Incoming_call"], "y",label="Valid")
plt.plot(res3,"g", label="ARIMA Model")
plt.legend(loc="best")
plt.title("ARIMA Forecast")
plt.show()
In [124]:
from statsmodels.graphics.tsaplots import plot_predict
fig, (ax) = plt.subplots()
ax= df3["Count of Date_Incoming_call"].plot(ax=ax)
plot_predict(arima_model2, "2022-10-24", "2022-12-18", ax=ax)
plt.title("Arima Forecast")
Out[124]:
Text(0.5, 1.0, 'Arima Forecast')
In [112]:
week1 =arima_model2.predict("2022-10-24","2022-10-30")
print("Prediction for Week 1:",week1)
week2= arima_model2.predict("2022-10-31", "2022-11-06")
print("Prediction for Week 2:",week2)
week3= arima_model2.predict("2022-11-07", "2022-11-13")
print("Prediction for Week 3:",week3)
week4= arima_model2.predict("2022-11-14", "2022-11-20")
print("Prediction for Week 4:",week4)
week5= arima_model2.predict("2022-11-21", "2022-11-27")
print("Prediction for Week 5:",week5)
week6= arima_model2.predict("2022-11-28", "2022-12-04")
print("Prediction for Week 6:",week6)
week7= arima_model2.predict("2022-12-05", "2022-12-11")
print("Prediction for Week 7:",week7)
week8= arima_model2.predict("2022-12-12", "2022-12-18")
print("Prediction for Week 8:",week8)
Prediction for Week 1: 2022-10-30    5.451387
Freq: W-SUN, dtype: float64
Prediction for Week 2: 2022-11-06    6.403464
Freq: W-SUN, dtype: float64
Prediction for Week 3: 2022-11-13    6.977868
Freq: W-SUN, dtype: float64
Prediction for Week 4: 2022-11-20    7.493737
Freq: W-SUN, dtype: float64
Prediction for Week 5: 2022-11-27    7.957037
Freq: W-SUN, dtype: float64
Prediction for Week 6: 2022-12-04    8.373124
Freq: W-SUN, dtype: float64
Prediction for Week 7: 2022-12-11    8.746809
Freq: W-SUN, dtype: float64
Prediction for Week 8: 2022-12-18    9.082414
Freq: W-SUN, dtype: float64
In [111]:
 
Out[111]:
2022-10-30    5.451387
Freq: W-SUN, dtype: float64
In [ ]: